In this script, we set up the methods for analyzing the association between cell counts and an outcome of interest. We will use calendar age as an example outcome to see what happens using several methods.

We show the following methods:

  • Univariable regression (age ~ cell count)
  • Multivariable regression (age ~ cc1 + cc2 + cc3…)
  • Principal component regression (age ~ PC1 + PC2 + PC3…)

We also make some plots of the distribution of variables in our dataset.

Distributions

Load sample sheet.

setwd("/exports/molepi/RSC_BIOS/Users/tjonkman/cellcounts")
.libPaths("/exports/molepi/RSC_BIOS/Users/tjonkman/Packages/4.3.1")

load("01_sample_sheet.rda")
dim(ss)
## [1] 4058   27
dim(cc)
## [1] 4058   12
# Set the variable of interest. In this case: calendar age.
ss$var <- ss$age

#Print cohort characteristics.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ss %>% 
  summarise(
    samples = n(),
    age.mean = round(mean(age), 1),
    age.sd = round(sd(age), 1),
    age.range = paste0(min(age), "-", max(age)),
    male.female.perc = paste0(round(mean(sex == "male") * 100), "%/", round(mean(sex == "female") * 100), "%")
  )
##   samples age.mean age.sd age.range male.female.perc
## 1    4058     51.2   16.4     18-87          42%/58%
ss %>% 
  group_by(study) %>% 
  summarise(
    samples = n(),
    age.mean = round(mean(age), 1),
    age.sd = round(sd(age), 1),
    age.range = paste0(min(age), "-", max(age)),
    male.female.perc = paste0(round(mean(sex == "male") * 100), "%/", round(mean(sex == "female") * 100), "%")
  )
## # A tibble: 6 × 6
##   study samples age.mean age.sd age.range male.female.perc
##   <fct>   <int>    <dbl>  <dbl> <chr>     <chr>           
## 1 CODAM     154     65.6    7   50-79     54%/46%         
## 2 LL        811     46.8   13.6 18-81     43%/57%         
## 3 LLS       714     58.3    6.8 30-79     48%/52%         
## 4 NTR      1399     37.6   13.9 18-79     34%/66%         
## 5 PAN       173     62.3    9.6 37-87     61%/39%         
## 6 RS        807     67.6    7.1 38-87     43%/57%

Show age distribution.

library(ggplot2)
ggplot(ss, aes(x = age)) + 
  geom_histogram(fill = "#005580", color = "#ffffff", binwidth = 5, breaks = seq(15, 90, 5)) + 
  theme_bw() + 
  scale_x_continuous(breaks = seq(15, 90, 5)) + 
  scale_y_continuous(breaks = seq(0, 600, 100)) + 
  theme(panel.grid.minor = element_blank()) +
  labs(x = "Age (years)", y = "Frequency")

Show cell count distribution.

distr <- data.frame(
  row.names = colnames(ss[,11:22]),
  mean = round(apply(ss[,11:22], 2, mean), 1),
  sd = round(apply(ss[,11:22], 2, sd), 1)
)

#Prepare data for plotting.
celltypes <- c("Neutrophils", "Eosinophils", "Basophils", "Monocytes", "Naive B cells", "Memory B cells", "Naive CD4 T cells", "Memory CD4 T cells", "Naive CD8 T cells", "Memory CD8 T cells", "Regulatory T cells", "Natural Killer cells")
rownames(distr) <- celltypes
distr$Fraction <- factor(rownames(distr), levels = rownames(distr))
distr$mean.lab <- format(round(distr$mean, 1), 1)
distr
##                      mean   sd             Fraction mean.lab
## Neutrophils          56.8 12.1          Neutrophils     56.8
## Eosinophils           1.0  1.3          Eosinophils      1.0
## Basophils             2.8  0.7            Basophils      2.8
## Monocytes             6.2  2.8            Monocytes      6.2
## Naive B cells         2.6  1.8        Naive B cells      2.6
## Memory B cells        2.1  0.9       Memory B cells      2.1
## Naive CD4 T cells     5.0  3.6    Naive CD4 T cells      5.0
## Memory CD4 T cells    3.9  3.1   Memory CD4 T cells      3.9
## Naive CD8 T cells     2.6  2.5    Naive CD8 T cells      2.6
## Memory CD8 T cells    6.1  3.7   Memory CD8 T cells      6.1
## Regulatory T cells    3.4  1.0   Regulatory T cells      3.4
## Natural Killer cells  7.4  3.2 Natural Killer cells      7.4
#Define colors.
ct.palette <- c("#cc3333", "#a32929", "#7a1f1f", "#e68019", 
                "#5acc56", "#439940",
                "#66baff", "#5295cc", "#3d7099", "#294a66", 
                "#5962b3", "#984ea3"
                )

#Make pie chart.
library(shadowtext)
ggplot(distr, aes(x = "", y = mean, fill = Fraction, label = mean.lab)) +
  geom_bar(stat = "identity", color = "white") +
  geom_shadowtext(aes(label = mean.lab, y = mean, x = 1.35), position = position_stack(vjust = 0.5), color = "black", bg.color = "white", size = 4) +
  coord_polar("y", start = 0) +
  theme_minimal()+
  theme(axis.text = element_blank(), axis.title = element_blank(), panel.grid = element_blank()) +
  scale_fill_manual(values = ct.palette)

Plot correlations of cell types with age.

plot.data <- ss

library(reshape2)
plot.melt <- melt(plot.data[,c(1:4, 11:22)], id.vars = c("id", "age", "sex", "study"), variable.name = "Fraction", value.name = "Percentage")
levels(plot.melt$Fraction) <- celltypes
# head(plot.melt)

annots <- data.frame(
  Fraction = celltypes,
  age.cor = paste0("R: ", as.numeric(round(cor(plot.data$age, plot.data[,11:22]), 2)))
)
annots$Fraction <- factor(annots$Fraction, levels = unique(annots$Fraction))

ggplot(plot.melt, aes(x = age, y = Percentage, color = Fraction)) +
  geom_point(shape = 1) +
  geom_smooth(method = "lm", color = "#222222") +
  facet_wrap(facets = vars(Fraction), scales = "free", nrow = 2, labeller = labeller(Fraction = as_labeller(1:12))) +
  theme_bw() +
  scale_color_manual(values = ct.palette) +
  geom_shadowtext(data = annots, aes(x = 18, y = Inf, label = age.cor, color = Fraction), hjust = 0, vjust = 1.5, size = 4, color = "black", bg.color = "white") +
  guides(color= "none") +
  labs(x = "Age (years)") +
  scale_x_continuous(breaks = c(0, 20, 40, 60, 80))
## `geom_smooth()` using formula = 'y ~ x'

Example of DNAmAge vs AgeAccel

Show an example of 3 people: young, old and a young person who is considered older by the clock.

#Select 3 subjects for highlighting:
#A: 50 years old, horvath DNAmAge of 50 (young).
#B: 60 years old, horvath DNAmAge of 60 (old).
#C: 50 years old, horvath DNAmAge of 60 ("biologically old").

highlight.subjects <- ss[c(1072, 261, 184), c("age", "horvath")]
highlight.subjects$horvath <- round(highlight.subjects$horvath)
# highlight.subjects

#Select 50 random subjects for the example plot, plus the 3 highlighted subjects.
set.seed(1)
ss.subset <- ss[(ss$age > 40 & ss$age < 70),]
ss.subset <- ss[sample(x = rownames(ss.subset), size = 100), c("age", "horvath")]
ss.subset <- rbind(ss.subset, highlight.subjects)
# my.limits <- c(min(ss.subset[,c("age", "horvath")]), max(ss.subset[,c("age", "horvath")]))
my.limits <- c(40, 70)

highlight.subjects$subject.no <- c("A", "B", "C")

#Plot.
ggplot(ss.subset, aes(x = age, y = horvath)) + 
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_point(size = 2, color = "#999999", alpha = 1) +
  lims(x = my.limits, y = my.limits) +
  labs(x = "Calendar age (years)", y = "DNAmAge (years)") +
  theme_bw() +
  geom_segment(x = 50, xend = 60, y = 60, yend = 60, linetype = "dashed", color = "black") +
  geom_segment(x = 50, xend = 50, y = 50, yend = 60, linetype = "dashed", color = "black") + 
  geom_point(data = highlight.subjects, mapping = aes(x = age, y = horvath), color = c("#009900", "#004499", "#990000"), size = 4) +
  geom_point(data = highlight.subjects, mapping = aes(x = age, y = horvath), color = c("#11CC11", "#1166CC", "#CC1111"), size = 2.5) + 
  geom_shadowtext(data = highlight.subjects, mapping = aes(x = age, y = horvath, label = subject.no), color = c("#11CC11", "#1166CC", "#CC1111"), bg.color = "white", size = 6, hjust = c(1.6, -0.6, 1.6), fontface = "bold")
## Warning: Removed 8 rows containing missing values (`geom_point()`).

Run several tests to investigate the age effect of cell types.

3 methods will be used:

  • Univariable regression (age ~ cell count)
  • Multivariable regression (age ~ cc1 + cc2 + cc3…)
  • Principal component regression (age ~ PC1 + PC2 + PC3…)

Method 1: Univariable regression

#Make a placeholder column for storing a single cell type in.
ss$ct <- NaN

#Make a dataframe to store results.
res <- data.frame(
  row.names = colnames(cc),
  cell.type = celltypes,
  estimate = NaN,
  std.error = NaN,
  t.stat = NaN,
  p.val = NaN,
  expl.var = NaN
)
# res

for(i in 1:ncol(cc)){
  
  #Select 1 cell type.
  ct <- colnames(cc)[i]
  ss$ct <- ss[,ct]
  
  #Test association with var.
  fit <- lm(age ~ ct, data = ss)
  
  #Store results
  s <- summary(fit)
  res[i, 2:5] <- s$coefficients["ct",]
  
  #Which percentage of the variance of the clock is explained by this cell type?
  res$expl.var[i] <- s$r.squared
}

res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 2)
res[,4] <- round(res[,4], 1)
res[,6] <- round(res[,6], 3)

#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)

res$cell.type <- factor(res$cell.type, levels = res$cell.type)

res.1ct <- res
res.1ct
##                    cell.type estimate std.error t.stat        p.val expl.var
## Neu              Neutrophils     0.05      0.02    2.4 1.568890e-02    0.001
## Eos              Eosinophils     0.21      0.20    1.0 2.943382e-01    0.000
## Baso               Basophils     1.41      0.38    3.7 2.516023e-04    0.003
## Mono               Monocytes     0.79      0.09    8.6 1.270487e-17    0.018
## Bnv            Naive B cells    -0.75      0.14   -5.2 2.277494e-07    0.007
## Bmem          Memory B cells     1.37      0.27    5.0 5.068212e-07    0.006
## CD4Tnv     Naive CD4 T cells    -1.11      0.07  -15.9 3.857653e-55    0.059
## CD4Tmem   Memory CD4 T cells     1.24      0.08   15.3 3.201256e-51    0.054
## CD8Tnv     Naive CD8 T cells    -3.88      0.08  -46.4 0.000000e+00    0.346
## CD8Tmem   Memory CD8 T cells     0.76      0.07   11.0 7.210563e-28    0.029
## Treg      Regulatory T cells    -2.82      0.25  -11.5 3.832852e-30    0.032
## NK      Natural Killer cells     0.57      0.08    7.0 2.994033e-12    0.012
##         ci.lower ci.upper
## Neu       0.0108   0.0892
## Eos      -0.1820   0.6020
## Baso      0.6652   2.1548
## Mono      0.6136   0.9664
## Bnv      -1.0244  -0.4756
## Bmem      0.8408   1.8992
## CD4Tnv   -1.2472  -0.9728
## CD4Tmem   1.0832   1.3968
## CD8Tnv   -4.0368  -3.7232
## CD8Tmem   0.6228   0.8972
## Treg     -3.3100  -2.3300
## NK        0.4132   0.7268
#Make the plot.
plot.theme <- list(
  geom_hline(yintercept = 0, linetype = "dashed"), 
  geom_point(color = "#005580", shape = 19, size = 1.5), 
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.4, color = "#005580"), 
  labs(x = NULL, y = "Age effect (years per %pt.)"), 
  theme_bw(),
  guides(color = "none"), 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

ggplot(res.1ct, aes(x = cell.type, y = estimate)) + 
  plot.theme

To illustrate that the cell types are strongly correlated with each other, make a correlation heatmap.

cor.cc <- cor(as.matrix(cc))
rownames(cor.cc) <- celltypes
colnames(cor.cc) <- celltypes

#Convert the correlation matrix to a long format for plotting.
cor.cc <- melt(cor.cc, value.name = "Correlation")

#Create the heatmap.
ggplot(cor.cc, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile() +
  geom_text(aes(label = format(round(Correlation, 1), 1)), vjust = 0.5, hjust = 0.5, size = 2.5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = 0, limit = c(-1, 1)) +
  labs(x = NULL, y = NULL) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  guides(fill = "none")

To demonstrate that these correlations lead to ambiguous effect sizes, show what happens when you correct the age effect of neutrophils for different cell types.

#Make a placeholder column for storing a single cell type in.
ss$ct <- NaN

#Make a dataframe to store results.
res <- data.frame(
  row.names = c("none", colnames(cc[,-1])),
  cell.type = c("none", celltypes[2:12]),
  estimate = NaN,
  std.error = NaN,
  t.stat = NaN,
  p.val = NaN
)
# res

#Fill in the unadjusted neutrophil age effect.
res[1,2:5] <- res.1ct["Neu", 2:5]

#Fill in the neutrophil age effects, each time adjusted for another cell type.
for(i in 1:ncol(cc[,-1])){
  
  #Select 1 cell type.
  ct <- colnames(cc[,-1])[i]
  ss$ct <- ss[,ct]
  
  #Test association with var.
  fit <- lm(age ~ Neu + ct, data = ss)
  
  #Store results
  s <- summary(fit)
  res[i+1, 2:5] <- s$coefficients["Neu",]
}

res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 2)
res[,4] <- round(res[,4], 1)

#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)

res$cell.type <- factor(res$cell.type, levels = res$cell.type)

res.neu.corr <- res
res.neu.corr
##                    cell.type estimate std.error t.stat        p.val ci.lower
## none                    none     0.05      0.02    2.4 1.568890e-02   0.0108
## Eos              Eosinophils     0.09      0.03    3.6 3.325271e-04   0.0312
## Baso               Basophils     0.17      0.03    6.2 6.168720e-10   0.1112
## Mono               Monocytes     0.18      0.02    7.7 2.267858e-14   0.1408
## Bnv            Naive B cells    -0.01      0.02   -0.4 7.154132e-01  -0.0492
## Bmem          Memory B cells     0.14      0.02    5.8 7.512775e-09   0.1008
## CD4Tnv     Naive CD4 T cells    -0.14      0.02   -5.9 4.862618e-09  -0.1792
## CD4Tmem   Memory CD4 T cells     0.35      0.02   14.0 1.115648e-43   0.3108
## CD8Tnv     Naive CD8 T cells    -0.30      0.02  -16.5 4.003626e-59  -0.3392
## CD8Tmem   Memory CD8 T cells     0.20      0.02    8.4 5.172977e-17   0.1608
## Treg      Regulatory T cells    -0.01      0.02   -0.2 8.067200e-01  -0.0492
## NK      Natural Killer cells     0.21      0.03    8.1 1.019528e-15   0.1512
##         ci.upper
## none      0.0892
## Eos       0.1488
## Baso      0.2288
## Mono      0.2192
## Bnv       0.0292
## Bmem      0.1792
## CD4Tnv   -0.1008
## CD4Tmem   0.3892
## CD8Tnv   -0.2608
## CD8Tmem   0.2392
## Treg      0.0292
## NK        0.2688
#Add the models.
res.neu.corr$model <- c("Neu", "Neu + Eos", "Neu + Baso", "Neu + Mono", "Neu + Bnv", "Neu + Bmem", "Neu + CD4Tnv", "Neu + CD4Tmem", "Neu + CD8Tnv", "Neu + CD8Tmem", "Neu + Treg", "Neu + NK")
res.neu.corr$model <- factor(res.neu.corr$model, levels = unique(res.neu.corr$model))

#Make the plot.
ggplot(res.neu.corr, aes(x = model, y = estimate)) +
  geom_hline(yintercept = res.neu.corr["none", "estimate"], linetype = "dashed", color = "#333333", alpha = 1, linewidth = 0.5) + 
  geom_point(color = "#005580", shape = 19, size = 1.5) + 
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.4, color = "#005580") + 
  labs(y = "Neutrophil age effect (years per %pt.)") + 
  theme_bw() + 
  guides(color = "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())


Method 2: Multivariable regression

#Empty dataframe for storing the model intercepts.
ints <- data.frame(
  row.names = paste0("No ", celltypes),
  ct.left.out = paste0("No ", celltypes),
  intercept = rep(NaN, 12)
)

corr.12ct <- function(form, ct.left.out){
  
  #Test association with age.
  fit <- lm(form, data = ss)
  
  #Add a dummy entry for the cell type that was set to NA.
  na.ct <- data.frame(
    row.names = names(fit$coefficients[13]),
    estimate = NaN,
    std.error = NaN,
    t.stat = NaN,
    p.val = NaN,
    ci.lower = NaN,
    ci.upper = NaN
    )
  
  #Store results.
  s <- summary(fit)
  res <- as.data.frame(s$coefficients)
  
  colnames(res) <- c("estimate", "std.error", "t.stat", "p.val")

  #Calculate 95% confidence intervals.
  res$ci.lower <- res$estimate - (1.96 * res$std.error)
  res$ci.upper <- res$estimate + (1.96 * res$std.error)
  
  res[,1] <- round(res[,1], 2)
  res[,2] <- round(res[,2], 2)
  res[,3] <- round(res[,3], 1)
  # res[,4] <- format(res[,4], scientific = T, digits = 2)
  res[,5] <- round(res[,5], 2)
  res[,6] <- round(res[,6], 2)
  
  #Add the intercept to the dataframe.
  ints[ct.left.out, 2] <<- round(res[1,1], 1)
  
  # #Remove the intercept effect, selecting only cell type effects.
  # print(res[1,])
  res <- res[-(1),]
  
  #Add the cell type that was set to NA in this model.
  res <- rbind(res, na.ct)
  
  #Re-order columns.
  res <- res[c("Neu", "Eos", "Baso", "Mono", "Bnv", "Bmem", "CD4Tnv", "CD4Tmem", "CD8Tnv", "CD8Tmem", "Treg", "NK"),]
  rownames(res) <- celltypes
  
  res$ct <- factor(rownames(res), levels = rownames(res))
  res$ct.left.out <- ct.left.out
  
  return(res)
}

res.no.neu     <- corr.12ct(formula(age ~ Baso + Eos + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono + Neu), "No Neutrophils")
res.no.eos     <- corr.12ct(formula(age ~ Baso + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono + Neu + Eos), "No Eosinophils")
res.no.baso    <- corr.12ct(formula(age ~ NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono + Neu + Eos + Baso), "No Basophils")
res.no.mono    <- corr.12ct(formula(age ~ Baso + Eos + Neu + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono), "No Monocytes")
res.no.bnv     <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv), "No Naive B cells")
res.no.bmem    <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem), "No Memory B cells")
res.no.cd4tnv  <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv), "No Naive CD4 T cells")
res.no.cd4tmem <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem), "No Memory CD4 T cells")
res.no.cd8tnv  <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + NK + Treg + CD8Tmem + CD8Tnv), "No Naive CD8 T cells")
res.no.cd8tmem <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + NK + Treg + CD8Tmem), "No Memory CD8 T cells")
res.no.treg    <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + CD8Tmem + NK + Treg), "No Regulatory T cells")
res.no.nk      <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + CD8Tmem + Treg + NK), "No Natural Killer cells")

#Plot the effect sizes of each cell type for each model. This is supposed to illustrate that these fluctuate wildly based on which cell type you leave out.
res.12ct <- rbind(res.no.neu, res.no.eos, res.no.baso, res.no.mono, res.no.bnv, res.no.bmem, res.no.cd4tnv, res.no.cd4tmem, res.no.cd8tnv, res.no.cd8tmem, res.no.treg, res.no.nk)
res.12ct$ct.left.out <- factor(res.12ct$ct.left.out, levels = unique(res.12ct$ct.left.out))
head(res.12ct)
##                estimate std.error t.stat        p.val ci.lower ci.upper
## Neutrophils         NaN       NaN    NaN          NaN      NaN      NaN
## Eosinophils       -0.06      0.18   -0.3 7.430640e-01    -0.40     0.29
## Basophils          2.88      0.40    7.2 5.894134e-13     2.10     3.66
## Monocytes          0.42      0.08    5.5 4.912255e-08     0.27     0.57
## Naive B cells     -0.23      0.12   -1.9 6.133143e-02    -0.47     0.01
## Memory B cells     0.49      0.25    2.0 4.873306e-02     0.00     0.97
##                            ct    ct.left.out
## Neutrophils       Neutrophils No Neutrophils
## Eosinophils       Eosinophils No Neutrophils
## Basophils           Basophils No Neutrophils
## Monocytes           Monocytes No Neutrophils
## Naive B cells   Naive B cells No Neutrophils
## Memory B cells Memory B cells No Neutrophils
#Test the proportion of variance explained by this method, compared to an intercept-only model.
#NOTE: this only has to be done once. The total effect of each 11-celltype model is exactly the same.
fit <- lm(formula = age ~ Neu + Eos + Baso + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + CD8Tmem + Treg + NK, data = ss)
summary(fit)
## 
## Call:
## lm(formula = age ~ Neu + Eos + Baso + Mono + Bnv + Bmem + CD4Tnv + 
##     CD4Tmem + CD8Tnv + CD8Tmem + Treg + NK, data = ss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.263  -8.510   0.755   8.696  43.848 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.30593    6.84405   8.958  < 2e-16 ***
## Neu         -0.10786    0.06922  -1.558   0.1192    
## Eos         -0.16578    0.19598  -0.846   0.3977    
## Baso         2.76971    0.41016   6.753 1.66e-11 ***
## Mono         0.31098    0.11310   2.750   0.0060 ** 
## Bnv         -0.33896    0.14263  -2.377   0.0175 *  
## Bmem         0.38086    0.25652   1.485   0.1377    
## CD4Tnv      -0.42876    0.09307  -4.607 4.21e-06 ***
## CD4Tmem      0.93182    0.10815   8.616  < 2e-16 ***
## CD8Tnv      -3.90102    0.11033 -35.359  < 2e-16 ***
## CD8Tmem     -0.11937    0.10272  -1.162   0.2453    
## Treg        -1.12477    0.23974  -4.692 2.80e-06 ***
## NK                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.37 on 4046 degrees of freedom
## Multiple R-squared:  0.435,  Adjusted R-squared:  0.4335 
## F-statistic: 283.2 on 11 and 4046 DF,  p-value: < 2.2e-16
var.ex <- paste0("R²: ", round(summary(fit)$r.squared * 100, 1), "%")
var.ex
## [1] "R²: 43.5%"
ints$intercept <- paste0("Intercept: ", ints$intercept)
ints$ct.left.out <- factor(ints$ct.left.out, levels = ints$ct.left.out)

#Plot the effect sizes for each of the 12 models, each time leaving out a different cell type.
#Print the intercepts at the top left, and the variance explained at the top right.
ggplot(res.12ct, aes(x = ct, y = estimate)) +
  plot.theme +
  geom_vline(data = subset(res.12ct, is.na(estimate)), aes(xintercept = ct), color = "red", alpha = 0.5, linetype = "dashed") +
  facet_wrap(facets = vars(ct.left.out), scales = "free") +
  geom_text(data = ints, aes(x = 0.6, y = 8.3, label = intercept), hjust = 0, vjust = 0.6) +
  annotate("text", x = 12.4, y = 8.3, label = var.ex, hjust = 1, vjust = 0.6) +
  scale_y_continuous(limits = c(-7.5, 8.4), breaks = seq(-10, 10, 2))
## Warning: Removed 12 rows containing missing values (`geom_point()`).

Print separated versions of the results without naive CD8 T cells or without basophils, as examples.

ex.mod = "No Naive CD8 T cells"
res.ex <- res.12ct[res.12ct$ct.left.out == ex.mod,]
ggplot(res.ex, aes(x = ct, y = estimate)) +
  plot.theme +
  geom_vline(data = subset(res.ex, is.na(estimate)), aes(xintercept = ct), color = "red", alpha = 0.5, linetype = "dashed") +
  geom_text(data = ints[rownames(ints) == ex.mod,], aes(x = 0.6, y = 8.3, label = intercept), hjust = 0, vjust = 0.6) +
  annotate("text", x = 12.4, y = 8.3, label = var.ex, hjust = 1, vjust = 0.6) +
  scale_y_continuous(limits = c(-7.5, 8.4), breaks = seq(-10, 10, 2))
## Warning: Removed 1 rows containing missing values (`geom_point()`).

ex.mod = "No Basophils"
res.ex <- res.12ct[res.12ct$ct.left.out == ex.mod,]
ggplot(res.ex, aes(x = ct, y = estimate)) +
  plot.theme +
  geom_vline(data = subset(res.ex, is.na(estimate)), aes(xintercept = ct), color = "red", alpha = 0.5, linetype = "dashed") +
  geom_text(data = ints[rownames(ints) == ex.mod,], aes(x = 0.6, y = 8.3, label = intercept), hjust = 0, vjust = 0.6) +
  annotate("text", x = 12.4, y = 8.3, label = var.ex, hjust = 1, vjust = 0.6) +
  scale_y_continuous(limits = c(-7.5, 8.4), breaks = seq(-10, 10, 2))
## Warning: Removed 1 rows containing missing values (`geom_point()`).


Method 3: Principal component regression

First, calculate PCs.

#Calculate PCs in the cell counts.
set.seed(1)
pca <- prcomp(cc, scale = TRUE)

#Scree plot.
var.ex <- pca$sdev^2 / sum(pca$sdev^2)
scree.dat <- data.frame(PC = 1:length(var.ex), variance = var.ex, round.var = format(round(var.ex, 2)))
scree.dat <- scree.dat[1:11,]
ggplot(scree.dat, aes(x = PC, y = variance, label = round.var)) +
  geom_point(size = 2, color = "#005580") +
  geom_line(linewidth = 1, color = "#005580") +
  geom_text(hjust = -0.1, vjust = -0.4, size = 3.5) +
  labs(x = "Principal Component", y = "Proportion of variance explained") +
  scale_x_continuous(limits  = c(0.5, 11.5), breaks = 1:11) +
  theme_bw() +
  theme(panel.grid.minor = element_blank())

#Biplot.
biplot(pca, cex = 1, choices = c(1,2))

# Create a data frame with the principal component scores.
pca.df <- as.data.frame(pca$x[,1:12])

# Scatter plot of PC1 vs. PC2 with variance explained labeled.
ggplot(pca.df, aes(x = PC1, y = PC2)) +
  geom_point(shape = 1) +
  labs(x = paste0("PC1: ", round(var.ex[1]*100, 1), "%"), y = paste0("PC2: ", round(var.ex[2]*100, 1), "%")) +
  ggtitle("") +
  theme_bw()

Reverse PC sign if necessary to make the interpretation more intuitive.

#Correlations between PCs and cell counts.
corr.pcs <- cor(pca.df, cc)
colnames(corr.pcs) <- celltypes

#If the correlation of this fraction is below 0, inverse the sign of the PC so that it becomes positive.
for(i in 1:12){
  
  j <- which(corr.pcs[i,]^2 == max(corr.pcs[i,]^2))
  
  if(corr.pcs[i,j] == min(corr.pcs[i,])){
    
    pca.df[,i] <- -(pca.df[,i])
    pca$rotation[,i] <- -(pca$rotation[,i])
    corr.pcs[i,] <- -(corr.pcs[i,])
    
  }
}

#Check if the most explanatory cell type is always positive. Should be 100% TRUE.
for(i in 1:12){
  
  j <- which(corr.pcs[i,]^2 == max(corr.pcs[i,]^2))
  print(paste0(rownames(corr.pcs)[i], ": ", corr.pcs[i,j] == max(corr.pcs[i,])))

}
## [1] "PC1: TRUE"
## [1] "PC2: TRUE"
## [1] "PC3: TRUE"
## [1] "PC4: TRUE"
## [1] "PC5: TRUE"
## [1] "PC6: TRUE"
## [1] "PC7: TRUE"
## [1] "PC8: TRUE"
## [1] "PC9: TRUE"
## [1] "PC10: TRUE"
## [1] "PC11: TRUE"
## [1] "PC12: TRUE"
save(pca.df, file = "02_pca_df.rda")

Check how the PCs are correlated with individual cell counts.

plot.data <- cbind(pca.df, cc)

#Scatter plot of principal components with all 12 cell types.
plot.data <- melt(plot.data, id.vars = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12"), variable.name = "cell.type", value.name = "proportion")

round(cor(pca.df, cc), 1)
##       Neu  Eos Baso Mono  Bnv Bmem CD4Tnv CD4Tmem CD8Tnv CD8Tmem Treg   NK
## PC1   1.0 -0.6 -0.7 -0.5 -0.5 -0.6   -0.4    -0.6   -0.3    -0.4 -0.3 -0.5
## PC2   0.0  0.0  0.0 -0.3  0.3 -0.2    0.7    -0.2    0.6    -0.6  0.4 -0.3
## PC3   0.1  0.0  0.3 -0.3 -0.3  0.2   -0.2    -0.5   -0.1     0.4  0.7  0.1
## PC4   0.1  0.0  0.0 -0.5 -0.1  0.6    0.1     0.4    0.0     0.1 -0.1 -0.4
## PC5  -0.1 -0.5 -0.1 -0.3  0.5 -0.1    0.0     0.0    0.1     0.4 -0.1  0.1
## PC6  -0.1 -0.1 -0.3  0.0 -0.4  0.2    0.1    -0.1    0.5     0.1 -0.2  0.3
## PC7  -0.1 -0.3  0.0 -0.1 -0.2  0.0    0.4     0.2   -0.4    -0.2  0.1  0.4
## PC8   0.0  0.5 -0.1 -0.5  0.0 -0.3    0.0     0.1    0.0     0.0 -0.1  0.3
## PC9  -0.1  0.1 -0.1  0.1 -0.1 -0.1    0.4    -0.2   -0.2     0.4 -0.1 -0.3
## PC10  0.0  0.1 -0.3  0.0  0.2  0.4    0.1    -0.3   -0.2    -0.1  0.0  0.1
## PC11  0.1  0.0  0.4 -0.1  0.0  0.1    0.1    -0.2    0.0    -0.1 -0.3  0.1
## PC12  0.7 -0.5 -0.5 -0.3 -0.4 -0.5   -0.2    -0.6   -0.2    -0.4 -0.1 -0.5
library(ggpubr)
plot.theme <- list(
  geom_point(shape = 1),
  geom_smooth(method = "lm"),
  stat_cor(method = "pearson", cor.coef.name = "R", label.x = -Inf, label.y = 105, hjust = 0, vjust = 0.7, size = 3.5, r.digits = 1, label.sep = ",  "),
  facet_wrap(facets = vars(cell.type)),
  theme_bw(),
  theme(strip.text = element_text(size = 10)),
  scale_y_continuous(limits = c(-10, 105), breaks = seq(-25, 100, 25))
)

#Correlations between PCs and cell counts.
corr.pcs <- corr.pcs[1:11,]

corr.pcs <- melt(corr.pcs, varnames = c("PC", "cell.type"), value.name = "Correlation")
corr.pcs$cell.type <- factor(corr.pcs$cell.type, levels = rev(levels(corr.pcs$cell.type)))

ggplot(corr.pcs, aes(x = PC, y = cell.type, fill = Correlation)) +
  geom_tile() +
  geom_text(aes(label = format(round(Correlation, 1), digits = 1)), vjust = 0.5, hjust = 0.5, size = 2.5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = 0, limit = c(-1, 1)) +
  labs(x = NULL, y = NULL) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  guides(fill = "none")

Include the PCs into the model, and plot the results.

ss.pc <- cbind(ss, pca.df)

fit <- lm(formula = age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11, data = ss.pc)
summary(fit)
## 
## Call:
## lm(formula = age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + 
##     PC8 + PC9 + PC10 + PC11, data = ss.pc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.263  -8.510   0.755   8.696  43.848 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.15771    0.19421 263.411  < 2e-16 ***
## PC1          0.02333    0.09792   0.238  0.81169    
## PC2         -6.46157    0.14945 -43.237  < 2e-16 ***
## PC3         -0.72825    0.17574  -4.144 3.49e-05 ***
## PC4          0.99617    0.19501   5.108 3.40e-07 ***
## PC5         -1.95336    0.22413  -8.715  < 2e-16 ***
## PC6         -6.28966    0.22680 -27.732  < 2e-16 ***
## PC7          4.35027    0.23650  18.395  < 2e-16 ***
## PC8         -0.44157    0.24144  -1.829  0.06749 .  
## PC9          0.83518    0.26370   3.167  0.00155 ** 
## PC10         0.09467    0.29267   0.323  0.74636    
## PC11         0.76573    0.31814   2.407  0.01613 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.37 on 4046 degrees of freedom
## Multiple R-squared:  0.435,  Adjusted R-squared:  0.4335 
## F-statistic: 283.2 on 11 and 4046 DF,  p-value: < 2.2e-16
var.ex <- paste0("R²: ", round(summary(fit)$r.squared * 100, 1), "%")

res <- as.data.frame(summary(fit)$coefficients)
colnames(res) <- c("estimate", "std.error", "t.stat", "p.val")

#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)

res[,1] <- round(res[,1], 2)
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 1)
res[,5] <- round(res[,5], 2)
res[,6] <- round(res[,6], 2)

# #Remove the intercept and the age effect, selecting only cell type effects.
pc.int <- res[1,1]
res <- res[-(1),]

res$PC <- factor(rownames(res), levels = rownames(res))

res.pcs <- res
res.pcs
##      estimate std.error t.stat         p.val ci.lower ci.upper   PC
## PC1      0.02      0.10    0.2  8.116918e-01    -0.17     0.22  PC1
## PC2     -6.46      0.15  -43.2  0.000000e+00    -6.75    -6.17  PC2
## PC3     -0.73      0.18   -4.1  3.486108e-05    -1.07    -0.38  PC3
## PC4      1.00      0.20    5.1  3.399769e-07     0.61     1.38  PC4
## PC5     -1.95      0.22   -8.7  4.155524e-18    -2.39    -1.51  PC5
## PC6     -6.29      0.23  -27.7 4.057491e-155    -6.73    -5.85  PC6
## PC7      4.35      0.24   18.4  1.229801e-72     3.89     4.81  PC7
## PC8     -0.44      0.24   -1.8  6.749274e-02    -0.91     0.03  PC8
## PC9      0.84      0.26    3.2  1.550561e-03     0.32     1.35  PC9
## PC10     0.09      0.29    0.3  7.463566e-01    -0.48     0.67 PC10
## PC11     0.77      0.32    2.4  1.613492e-02     0.14     1.39 PC11
plot.theme <-   list(
  geom_hline(yintercept = 0, linetype = "dashed"), 
  geom_point(color = "#005580", shape = 19, size = 1.5), 
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), color = "#005580", width = 0.4), 
  geom_vline(data = subset(res.pcs, is.na(estimate)), aes(xintercept = PC), color = "red", alpha = 0.5, linetype = "dashed"), 
  labs(x = NULL, y = "Age effect"), 
  theme_bw(), 
  guides(color = "none"),
  scale_y_continuous(limits = c(-8.5, 5.0), breaks = seq(-10, 10, 2)),
  theme(plot.margin = margin(t = 5.5, r = 5.5, b = 5.5, l = 22, unit = "pt")),
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
  )

#Manually add the apparent biological meaning to each PC.
pc.labs <- c("PC1: Neutrophils", "PC2: T cell naive/memory", "PC3: Treg/CD4Tmem", "PC4: Bmem/Mono", "PC5: Bnv/Eos", "PC6: Naive CD8 T cells", "PC7: Mixed (CD4Tnv/CD8Tnv)", "PC8: Eos/Mono", "PC9: Mixed (CD4Tnv+CD8Tmem)", "PC10: Mixed (Bmem)", "PC11: Mixed (Baso)")
res.pcs$PC.label <- pc.labs
res.pcs$PC.label <- factor(res.pcs$PC.label, levels = unique(res.pcs$PC.label))

#Intercept and variance explained.
ggplot(res.pcs, aes(x = PC.label, y = estimate)) +
  plot.theme +
  annotate("text", x = 0.6, y = 5.0, label = paste0("Intercept: ", pc.int), hjust = 0, vjust = 0.6) +
  annotate("text", x = 11.4, y = 5.0, label = var.ex, hjust = 1, vjust = 0.6)

Leave out PC2 to demonstrate that this doesn’t affect effect sizes for the remaining PCs (in contrast to the analysis using individual cell counts).

fit <- lm(formula = age ~ PC1 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11, data = ss.pc)
summary(fit)
## 
## Call:
## lm(formula = age ~ PC1 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + 
##     PC9 + PC10 + PC11, data = ss.pc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.836 -11.535   1.016  11.511  48.164 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.15771    0.23480 217.875  < 2e-16 ***
## PC1          0.02333    0.11838   0.197 0.843781    
## PC3         -0.72825    0.21247  -3.427 0.000615 ***
## PC4          0.99617    0.23577   4.225 2.44e-05 ***
## PC5         -1.95336    0.27097  -7.209 6.70e-13 ***
## PC6         -6.28966    0.27421 -22.938  < 2e-16 ***
## PC7          4.35027    0.28592  15.215  < 2e-16 ***
## PC8         -0.44157    0.29190  -1.513 0.130430    
## PC9          0.83518    0.31881   2.620 0.008834 ** 
## PC10         0.09467    0.35384   0.268 0.789058    
## PC11         0.76573    0.38464   1.991 0.046571 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.96 on 4047 degrees of freedom
## Multiple R-squared:  0.174,  Adjusted R-squared:  0.1719 
## F-statistic: 85.24 on 10 and 4047 DF,  p-value: < 2.2e-16
test.var.ex <- paste0("R²: ", round(summary(fit)$r.squared * 100, 1), "%")

res <- as.data.frame(summary(fit)$coefficients)
colnames(res) <- c("estimate", "std.error", "t.stat", "p.val")

#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)

res[,1] <- round(res[,1], 2)
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 1)
res[,5] <- round(res[,5], 2)
res[,6] <- round(res[,6], 2)

# #Remove the intercept and the age effect, selecting only cell type effects.
test.pc.int <- res[1,1]
res <- res[-(1),]

#Add a dummy PC2.
res <- rbind(res[1,], c(NaN, NaN, NaN, NaN, NaN, NaN), res[-1,])
rownames(res)[2] <- "PC2"

res$PC <- factor(rownames(res), levels = rownames(res))

res.test <- res
res.test
##      estimate std.error t.stat         p.val ci.lower ci.upper   PC
## PC1      0.02      0.12    0.2  8.437812e-01    -0.21     0.26  PC1
## PC2       NaN       NaN    NaN           NaN      NaN      NaN  PC2
## PC3     -0.73      0.21   -3.4  6.153566e-04    -1.14    -0.31  PC3
## PC4      1.00      0.24    4.2  2.439152e-05     0.53     1.46  PC4
## PC5     -1.95      0.27   -7.2  6.700323e-13    -2.48    -1.42  PC5
## PC6     -6.29      0.27  -22.9 1.435043e-109    -6.83    -5.75  PC6
## PC7      4.35      0.29   15.2  7.040991e-51     3.79     4.91  PC7
## PC8     -0.44      0.29   -1.5  1.304304e-01    -1.01     0.13  PC8
## PC9      0.84      0.32    2.6  8.833748e-03     0.21     1.46  PC9
## PC10     0.09      0.35    0.3  7.890583e-01    -0.60     0.79 PC10
## PC11     0.77      0.38    2.0  4.657150e-02     0.01     1.52 PC11
#Manually add the apparent biological meaning to each PC.
res.test$PC.label <- pc.labs
res.test$PC.label <- factor(res.test$PC.label, levels = unique(res.test$PC.label))

#Intercept and variance explained.
ggplot(res.test, aes(x = PC.label, y = estimate)) +
  plot.theme +
  annotate("text", x = 0.6, y = 5.0, label = paste0("Intercept: ", pc.int), hjust = 0, vjust = 0.6) +
  annotate("text", x = 11.4, y = 5.0, label = test.var.ex, hjust = 1, vjust = 0.6) +
  geom_vline(data = subset(res.test, is.na(estimate)), aes(xintercept = PC.label), color = "red", alpha = 0.5, linetype = "dashed")
## Warning: Removed 1 rows containing missing values (`geom_point()`).


SessionInfo

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Amsterdam
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggpubr_0.6.0     reshape2_1.4.4   shadowtext_0.1.3 ggplot2_3.4.3   
## [5] dplyr_1.1.2     
## 
## loaded via a namespace (and not attached):
##  [1] tidyr_1.3.0       sass_0.4.7        utf8_1.2.3        generics_0.1.3   
##  [5] rstatix_0.7.2     stringi_1.7.12    lattice_0.21-8    digest_0.6.33    
##  [9] magrittr_2.0.3    evaluate_0.21     grid_4.3.1        fastmap_1.1.1    
## [13] plyr_1.8.8        jsonlite_1.8.7    Matrix_1.6-1      backports_1.4.1  
## [17] mgcv_1.8-42       purrr_1.0.2       fansi_1.0.4       scales_1.2.1     
## [21] jquerylib_0.1.4   abind_1.4-5       cli_3.6.1         rlang_1.1.1      
## [25] munsell_0.5.0     splines_4.3.1     withr_2.5.0       cachem_1.0.8     
## [29] yaml_2.3.7        tools_4.3.1       ggsignif_0.6.4    colorspace_2.1-0 
## [33] broom_1.0.5       vctrs_0.6.3       R6_2.5.1          lifecycle_1.0.3  
## [37] stringr_1.5.0     car_3.1-2         pkgconfig_2.0.3   pillar_1.9.0     
## [41] bslib_0.5.1       gtable_0.3.4      glue_1.6.2        Rcpp_1.0.11      
## [45] xfun_0.40         tibble_3.2.1      tidyselect_1.2.0  highr_0.10       
## [49] rstudioapi_0.15.0 knitr_1.43        farver_2.1.1      htmltools_0.5.6  
## [53] nlme_3.1-162      carData_3.0-5     rmarkdown_2.24    labeling_0.4.2   
## [57] compiler_4.3.1